library(tidyverse) # for data import, manipulation, viz
library(corrplot) # for correlation matix
library(stargazer) # visualize model fit
library(skimr) # generate summary statistics
library(car) # statistics
library(lmtest) # linear modeling
str(crime_df)
wages_df = crime_df %>% mutate(crmrte_shift = crmrte + 1) %>%
select(crmrte_shift, wcon, wtuc, wtrd, wfir, wser, wmfg, wfed, wsta, wloc, taxpc) %>%
mutate_all(., log)
cor(wages_df)
wcon: construction wages have high positive correlation with all wages but no correlation with wsta and wser. wtuc: utility wages have high positive correlation with all wages but no correlation with wsta and wser. wtrd: retail wages have high positive correlation with all wages but no correlation with wsta and wser. wfir: finance wages have high positive correlation with all wages but no correlation with wsta and wser. wser: service wages low correlation with all other wages and tax per capita wmfg: manufacturing wages have high positive correlation with all wages but no correlation with wsta and wser. wfed: fed employees wages have high positive correlation with all wages but no correlation with wsta and wser. wsta: state employees wages no correlation with any variable but a bit with fed employees wloc: local government employees wages have high correlation with all wages but wsta taxpc: no correlation with any
ggplot(wages_df, aes(x=log(wcon), y=log(crmrte))) +
geom_point()+
geom_smooth(method=lm, se=FALSE)
Wage Construction and Crime Rate have a high correlation
ggplot(wages_df, aes(x=log(wtuc), y=log(crmrte + 1))) +
geom_point()+
geom_smooth(method=lm, se=FALSE)
ggplot(wages_df, aes(x=log(wtrd), y=log(crmrte + 1))) +
geom_point()+
geom_smooth(method=lm, se=FALSE)
ggplot(wages_df, aes(x=log(wfir), y=log(crmrte + 1))) +
geom_point()+
geom_smooth(method=lm, se=FALSE)
ggplot(wages_df, aes(x=log(wser), y=log(crmrte + 1))) +
geom_point()+
geom_smooth(method=lm, se=FALSE)
ggplot(wages_df, aes(x=log(wmfg), y=log(crmrte + 1))) +
geom_point()+
geom_smooth(method=lm, se=FALSE)
ggplot(wages_df, aes(x=log(wfed), y=log(crmrte + 1))) +
geom_point()+
geom_smooth(method=lm, se=FALSE)
ggplot(wages_df, aes(x=log(wsta), y=log(crmrte + 1))) +
geom_point()+
geom_smooth(method=lm, se=FALSE)
ggplot(wages_df, aes(x=log(wloc), y=log(crmrte + 1))) +
geom_point()+
geom_smooth(method=lm, se=FALSE)
Wage Service and Crime Rate have no relationship. It could be because tips are an important part of the Service industry and they are not tracked easily.
ggplot(wages_df, aes(x=log(taxpc), y=log(crmrte + 1))) +
geom_point()+
geom_smooth(method=lm, se=FALSE)
Include wloc and wsta and wtrd.
wtrd and wcon and wfir and wmfg
Then include taxpc.
data2 <- data2 %>% mutate(eff_cj = prbarr*prbconv*prbpris)
model1 = lm(log(crmrte + 1) ~ log(eff_cj) + log(wloc) + log(wsta) + log(wtrd), data = data2)
model1$coefficients
eff_cj_md = lm(log(crmrte) ~ log(eff_cj), data = data2)
eff_cj_md$coefficients
plot(eff_cj_md)
wloc_md = lm(log(crmrte) ~ log(eff_cj) + log(wloc), data = data2)
wloc_md$coefficients
plot(wloc_md)
wloc_tax_md = lm(log(crmrte) ~ log(eff_cj) + log(wloc) + log(taxpc), data = data2)
wloc_tax_md$coefficients
plot(wloc_tax_md)
wloc_trd_tax_md = lm(log(crmrte) ~ log(eff_cj) + log(wloc) + log(wtrd) + log(taxpc), data = data2)
wloc_trd_md$coefficients
plot(wloc_trd_tax_md)
No noticeable improvement from adding wcon
wloc_con_md = lm(log(crmrte) ~ log(eff_cj) + log(wloc) + log(wcon), data = data2)
wloc_con_md$coefficients
plot(wloc_con_md)
No noticeable improvement by adding wtrd
wloc_trd_md = lm(log(crmrte) ~ log(eff_cj) + log(wloc) + log(wtrd), data = data2)
wloc_trd_md$coefficients
plot(wloc_trd_md)
No noticible improvement by adding wsta
wloc_sta_md = lm(log(crmrte) ~ log(eff_cj) + log(wloc) + log(wsta), data = data2)
plot(wloc_sta_md)
wtrd_md = lm(log(crmrte) ~ log(eff_cj) + log(wtrd), data = data2)
wtrd_md$coefficients
(Intercept) log(eff_cj) log(wtrd)
-11.530276 -0.442810 1.249529
plot(wtrd_md)
demo_df = data2 %>% select(crmrte, eff_cj, wloc, wtrd, density, pctmin80, pctymle) %>%
mutate_all(., log)
cor(demo_df)
crmrte eff_cj wloc wtrd density pctmin80 pctymle
crmrte 1.0000000 -0.561313323 0.302867062 0.38965891 0.49364251 0.397290965 0.311754030
eff_cj -0.5613133 1.000000000 0.007855728 -0.09396993 -0.13895393 0.039442815 -0.372393035
wloc 0.3028671 0.007855728 1.000000000 0.57419433 0.30295286 0.015342980 0.022589116
wtrd 0.3896589 -0.093969928 0.574194331 1.00000000 0.42921817 0.046240863 -0.097848085
density 0.4936425 -0.138953934 0.302952858 0.42921817 1.00000000 -0.018819680 0.175156111
pctmin80 0.3972910 0.039442815 0.015342980 0.04624086 -0.01881968 1.000000000 0.008048105
pctymle 0.3117540 -0.372393035 0.022589116 -0.09784808 0.17515611 0.008048105 1.000000000
wloc_md = lm(log(crmrte) ~ log(eff_cj) + log(wloc), data = data2)
wloc_md$coefficients
(Intercept) log(eff_cj) log(wloc)
-15.692389 -0.471548 1.872709
plot(wloc_md)
ymale_min_md = lm(log(crmrte) ~ log(eff_cj) + log(wloc) + pctmin80 + log(pctymle), data = data2)
ymale_min_md$coefficients
(Intercept) log(eff_cj) log(wloc) pctmin80 log(pctymle)
-16.78712764 -0.50121797 2.09661595 0.01214476 0.23631352
plot(ymale_min_md)
ymale_density_min_md = lm(log(crmrte) ~ log(eff_cj) + log(wloc) + pctmin80 + log(density) + log(pctymle), data = data2)
ymale_density_min_md$coefficients
(Intercept) log(eff_cj) log(wloc) pctmin80 log(density) log(pctymle)
-13.18459542 -0.47571392 1.41925751 0.01275751 0.15036226 0.09299417
plot(ymale_density_min_md)
Improves Heteroskedasticity
density_min_md = lm(log(crmrte) ~ log(eff_cj) + log(wloc) + pctmin80 + log(density), data = data2)
density_min_md$coefficients
(Intercept) log(eff_cj) log(wloc) pctmin80 log(density)
-13.43963749 -0.48599161 1.41756663 0.01282803 0.15215117
plot(density_min_md)
Improves zero conditional mean and heteroskedasticity significantly
min_md = lm(log(crmrte) ~ log(eff_cj) + log(wloc) + pctmin80 , data = data2)
min_md$coefficients
(Intercept) log(eff_cj) log(wloc) pctmin80
-17.55828744 -0.52860856 2.11310007 0.01230844
plot(min_md)
wtrd_min_md = lm(log(crmrte) ~ log(eff_cj) + log(wtrd) + log(pctmin80) , data = data2)
wtrd_min_md$coefficients
(Intercept) log(eff_cj) log(wtrd) log(pctmin80)
-11.8484351 -0.4577432 1.1746727 0.2311788
plot(wtrd_min_md)
No change after adding pctymle
ymale_md = lm(log(crmrte) ~ log(eff_cj) + log(wloc) + log(pctymle), data = data2)
ymale_md$coefficients
(Intercept) log(eff_cj) log(wloc) log(pctymle)
-14.7294721 -0.4371883 1.8555659 0.3048858
plot(ymale_md)
geo_df = data2 %>% mutate(ln_crmrte = log(crmrte), ln_eff_cj = log(eff_cj), ln_wloc = log(wloc), ln_wtrd = log(wtrd), ln_density = log(density), ln_taxpc = log(taxpc)) %>%
mutate(west_proper = ifelse(west == 1 & central != 1, 1, 0), central_proper = ifelse(west != 1 & central == 1, 1, 0)) %>%
select(ln_crmrte, ln_eff_cj, ln_wloc, ln_wtrd, ln_density, ln_taxpc, pctmin80, urban, west, central, west_proper)
cor(geo_df)
ln_crmrte ln_eff_cj ln_wloc ln_wtrd ln_density ln_taxpc pctmin80 urban west central west_proper
ln_crmrte 1.0000000 -0.561313323 0.302867062 0.38965891 0.49364251 0.33984322 0.23291821 0.49146445 -0.41439959 0.18471924 -0.45141763
ln_eff_cj -0.5613133 1.000000000 0.007855728 -0.09396993 -0.13895393 -0.26364343 0.17824194 -0.31631198 0.04357931 -0.05102979 0.07985026
ln_wloc 0.3028671 0.007855728 1.000000000 0.57419433 0.30295286 0.22092405 -0.10213445 0.33004924 -0.12518869 0.33062751 -0.15296513
ln_wtrd 0.3896589 -0.093969928 0.574194331 1.00000000 0.42921817 0.16350047 -0.07527956 0.39889779 -0.17146795 0.38409287 -0.19939002
ln_density 0.4936425 -0.138953934 0.302952858 0.42921817 1.00000000 0.10798692 -0.09668387 0.39272648 -0.21493695 0.28825936 -0.25035461
ln_taxpc 0.3398432 -0.263643431 0.220924053 0.16350047 0.10798692 1.00000000 0.02947733 0.39785937 -0.20386931 0.05466066 -0.19215618
pctmin80 0.2329182 0.178241937 -0.102134449 -0.07527956 -0.09668387 0.02947733 1.00000000 0.01619569 -0.63360382 -0.05487554 -0.62451443
urban 0.4914645 -0.316311979 0.330049244 0.39889779 0.39272648 0.39785937 0.01619569 1.00000000 -0.08681219 0.15927023 -0.08000341
west -0.4143996 0.043579312 -0.125188688 -0.17146795 -0.21493695 -0.20386931 -0.63360382 -0.08681219 1.00000000 -0.38987611 0.96990281
central 0.1847192 -0.051029790 0.330627510 0.38409287 0.28825936 0.05466066 -0.05487554 0.15927023 -0.38987611 1.00000000 -0.42986348
west_proper -0.4514176 0.079850262 -0.152965131 -0.19939002 -0.25035461 -0.19215618 -0.62451443 -0.08000341 0.96990281 -0.42986348 1.00000000
pctmin80 has high correlation with west but it has opposite impact on crmrte. urban is highly correlated with other variables, exclude. There is some overlap between west or central though they have opposity effects on crmrte. Parse variables. test: pctmin80 vs west_proper.
data2 = data2 %>% mutate(west_proper = ifelse(west == 1 & central != 1, 1, 0))
Benchmark
density_min_md = lm(log(crmrte) ~ log(eff_cj) + log(density) + pctmin80, data = data2)
density_min_md$coefficients
(Intercept) log(eff_cj) log(density) pctmin80
-5.24656874 -0.47332224 0.18038857 0.01219402
plot(density_min_md)
wtrd_west_md = lm(log(crmrte) ~ log(eff_cj) + log(density) + west_proper, data = data2)
wtrd_west_md$coefficients
(Intercept) log(eff_cj) log(density) west_proper
-4.6434990 -0.4077235 0.1375235 -0.4210135
plot(wtrd_west_md)
west
west_md = lm(log(crmrte) ~ log(eff_cj) + log(wtrd) + log(density) + west_proper, data = data2)
west_md$coefficients
(Intercept) log(eff_cj) log(wtrd) log(density) west_proper
-7.8303243 -0.4035657 0.5977962 0.1114841 -0.4007434
plot(west_md)
plot(density_min_md$fitted.values, log(data2$crmrte), main = "Actual vs Predicted")
abline(a=0,b=1)
plot(west_md$fitted.values, log(data2$crmrte), main = "Actual vs Predicted")
abline(a=0,b=1)
stargazer(density_min_md, wtrd_west_md, type = "text")
length of NULL cannot be changedlength of NULL cannot be changedlength of NULL cannot be changedlength of NULL cannot be changedlength of NULL cannot be changed
==========================================================
Dependent variable:
----------------------------
log(crmrte)
(1) (2)
----------------------------------------------------------
log(eff_cj) -0.473*** -0.408***
(0.056) (0.058)
log(density) 0.180*** 0.138***
(0.027) (0.029)
pctmin80 0.012***
(0.002)
west_proper -0.421***
(0.092)
Constant -5.247*** -4.643***
(0.189) (0.180)
----------------------------------------------------------
Observations 90 90
R2 0.628 0.591
Adjusted R2 0.615 0.577
Residual Std. Error (df = 86) 0.340 0.357
F Statistic (df = 3; 86) 48.480*** 41.393***
==========================================================
Note: *p<0.1; **p<0.05; ***p<0.01
crime_density_md$coefficients
(Intercept) log(eff_cj)
-5.8877067 -0.7604551
crime_density_2_md = lm(log(crmrte * density) ~ log(eff_cj) + log(pctmin80 * density), data = data2)
crime_density_2_md$coefficients
(Intercept) log(eff_cj) log(pctmin80 * density)
-7.7354360 -0.5585187 0.8657340
plot(crime_density_2_md$fitted.values, log(data2$crmrte), main = "Actual vs Predicted")
abline(a=0,b=1)
stargazer(density_min_md, wtrd_west_md, crime_density_2_md, type = "text")
length of NULL cannot be changedlength of NULL cannot be changedlength of NULL cannot be changedlength of NULL cannot be changedlength of NULL cannot be changednumber of rows of result is not a multiple of vector length (arg 2)
============================================================================================
Dependent variable:
--------------------------------------------------------------------
log(crmrte)
(1) (2) (3)
--------------------------------------------------------------------------------------------
log(eff_cj) -0.473*** -0.408*** -0.424***
(0.056) (0.058) (0.052)
log(density) 0.180*** 0.138***
(0.027) (0.029)
pctmin80 0.012***
(0.002)
west_proper -0.421***
(0.092)
log(pctmin80 * density) 0.195***
(0.021)
Constant -5.247*** -4.643*** -5.354***
(0.189) (0.180) (0.165)
--------------------------------------------------------------------------------------------
Observations 90 90 90
R2 0.628 0.591 0.662
Adjusted R2 0.615 0.577 0.654
Residual Std. Error 0.340 (df = 86) 0.357 (df = 86) 0.323 (df = 87)
F Statistic 48.480*** (df = 3; 86) 41.393*** (df = 3; 86) 85.029*** (df = 2; 87)
============================================================================================
Note: *p<0.1; **p<0.05; ***p<0.01